import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
# Load the raw global health expenditure dataset and preview its first five rows.
raw_csv_path = "/Users/sahithirao/Desktop/Sem3/DataViz/Project/Data set/dei-global-health-expenditure-world_dataset_Data-dei-global-health-expenditure-world.csv"
data = pd.read_csv(raw_csv_path)
data.head()
| Unnamed: 0 | country | code | region | income | year | che_gdp | che_pc_usd | hk_gdp | hk_g_gdp | ... | age1_gghed_ppp2020_pc | age1_ext_ppp2020_pc | age1_pvtd_ppp2020_pc | hk_ppp2020_pc | hk_gghed_ppp2020_pc | hk_ext_ppp2020_pc | hk_pvtd_ppp2020_pc | gdp_ppp2020_pc | pfc_ppp2020_pc | gge_ppp2020_pc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Algeria | DZA | AFR | Lower-middle | 2000 | 3.489033 | 62.117695 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9422.670850 | 3917.143779 | 2692.144010 |
| 1 | 1 | Algeria | DZA | AFR | Lower-middle | 2001 | 3.837877 | 67.338501 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9572.762464 | 4184.332806 | 2991.622448 |
| 2 | 2 | Algeria | DZA | AFR | Lower-middle | 2002 | 3.730042 | 66.947601 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9973.432108 | 4386.695961 | 3419.400063 |
| 3 | 3 | Algeria | DZA | AFR | Lower-middle | 2003 | 3.601041 | 76.235474 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10547.676227 | 4270.038642 | 3396.579475 |
| 4 | 4 | Algeria | DZA | AFR | Lower-middle | 2004 | 3.544073 | 93.024330 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10847.517657 | 4182.638819 | 3337.235958 |
5 rows × 3221 columns
# Understanding the data: dimensions of the raw frame as (rows, columns)
data.shape
(4224, 3221)
# Checking null values: number of missing entries in each column
data.isna().sum()
Unnamed: 0 0
country 0
code 0
region 0
income 0
...
hk_ext_ppp2020_pc 3540
hk_pvtd_ppp2020_pc 3881
gdp_ppp2020_pc 218
pfc_ppp2020_pc 218
gge_ppp2020_pc 223
Length: 3221, dtype: int64
# Keep only the columns holding at least 3000 non-null values; with the 4224
# rows of the raw frame this removes every column that is more than roughly
# 29% empty.
min_non_null = 3000
data_cleaned = data.dropna(axis='columns', thresh=min_non_null)
data_cleaned.shape
(4224, 415)
# Discard every row that still contains a missing value.
data_cleaned = data_cleaned.dropna(axis='index', how='any')
# After this step 2153 rows and 415 columns remain.
data_cleaned.shape
(2153, 415)
# Preview the frame left after dropping sparse columns and incomplete rows
data_cleaned
| Unnamed: 0 | country | code | region | income | year | che_gdp | che_pc_usd | che | gghed | ... | hf121_ppp2020_pc | hf122_ppp2020_pc | hf13_ppp2020_pc | hf2_ppp2020_pc | hf21_ppp2020_pc | hf22_ppp2020_pc | hf3_ppp2020_pc | gdp_ppp2020_pc | pfc_ppp2020_pc | gge_ppp2020_pc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Algeria | DZA | AFR | Lower-middle | 2000 | 3.489033 | 62.117695 | 143870.265625 | 103533.985000 | ... | 85.652478 | 0.0 | 0.0 | 7.359546 | 2.689580 | 0.182809 | 84.802063 | 9422.670850 | 3917.143779 | 2692.144010 |
| 1 | 1 | Algeria | DZA | AFR | Lower-middle | 2001 | 3.837877 | 67.338501 | 162230.890625 | 123663.777000 | ... | 94.641082 | 0.0 | 0.0 | 7.720256 | 3.113848 | 0.192492 | 79.608176 | 9572.762464 | 4184.332806 | 2991.622448 |
| 2 | 2 | Algeria | DZA | AFR | Lower-middle | 2002 | 3.730042 | 66.947601 | 168702.312500 | 126996.860810 | ... | 96.488899 | 0.0 | 0.0 | 8.160043 | 3.550284 | 0.199467 | 83.795529 | 9973.432108 | 4386.695961 | 3419.400063 |
| 3 | 3 | Algeria | DZA | AFR | Lower-middle | 2003 | 3.601041 | 76.235474 | 189137.484375 | 145057.483429 | ... | 98.706304 | 0.0 | 0.0 | 8.143257 | 3.725214 | 0.200820 | 80.328056 | 10547.676227 | 4270.038642 | 3396.579475 |
| 4 | 4 | Algeria | DZA | AFR | Lower-middle | 2004 | 3.544073 | 93.024330 | 217928.593750 | 155499.678178 | ... | 93.169561 | 0.0 | 0.0 | 9.755374 | 5.556859 | 0.194049 | 100.318061 | 10847.517657 | 4182.638819 | 3337.235958 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4152 | 4152 | Tonga | TON | WPR | Upper-middle | 2016 | 4.818208 | 191.936722 | 44.953880 | 27.868431 | ... | 0.000000 | 0.0 | 0.0 | 40.786449 | 4.164367 | 35.238399 | 16.521059 | 6373.839114 | 6156.507343 | 2370.548952 |
| 4153 | 4153 | Tonga | TON | WPR | Upper-middle | 2017 | 4.855778 | 212.571030 | 49.431820 | 29.589055 | ... | 0.000000 | 0.0 | 0.0 | 45.379721 | 4.301449 | 39.685420 | 17.064897 | 6603.159738 | 6359.166759 | 2620.507401 |
| 4154 | 4154 | Tonga | TON | WPR | Upper-middle | 2018 | 4.806281 | 219.287933 | 51.571392 | 34.362265 | ... | 0.000000 | 0.0 | 0.0 | 33.277787 | 4.537564 | 27.364712 | 18.001625 | 6636.249645 | 6708.234517 | 2634.708619 |
| 4155 | 4155 | Tonga | TON | WPR | Upper-middle | 2019 | 4.746700 | 229.944107 | 55.251587 | 33.663803 | ... | 0.000000 | 0.0 | 0.0 | 41.151269 | 4.327241 | 35.435537 | 17.167221 | 6698.871162 | 6397.297133 | 2578.259691 |
| 4156 | 4156 | Tonga | TON | WPR | Upper-middle | 2020 | 5.317539 | 248.039581 | 60.035019 | 34.282748 | ... | 0.000000 | 0.0 | 0.0 | 51.679572 | 4.505345 | 45.780973 | 17.873803 | 6721.856777 | 6660.602389 | 2589.909387 |
2153 rows × 415 columns
# Many entries are exactly 0; list the columns containing at least one zero.
has_zero = (data_cleaned == 0).any(axis=0)
zeroes = data_cleaned.columns[has_zero]
data_cleaned[zeroes]
| Unnamed: 0 | ext | vpp_che | ext_che | ext_pc_usd | chi_che | shi_che | chi_pvt_che | vhi_che | fs2 | ... | fs4_ppp2020_pc | fs5_ppp2020_pc | fs7_ppp2020_pc | hf12_ppp2020_pc | hf121_ppp2020_pc | hf122_ppp2020_pc | hf13_ppp2020_pc | hf2_ppp2020_pc | hf21_ppp2020_pc | hf22_ppp2020_pc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 75.081571 | 0.008181 | 0.052187 | 0.032417 | 26.053186 | 26.053186 | 0.0 | 0.818098 | 5.000000 | ... | 0.0 | 2.689580 | 0.160144 | 85.652478 | 85.652478 | 0.0 | 0.0 | 7.359546 | 2.689580 | 0.182809 |
| 1 | 1 | 75.081571 | 0.008476 | 0.046281 | 0.031165 | 25.760326 | 25.760326 | 0.0 | 0.847557 | 5.000000 | ... | 0.0 | 3.113848 | 0.158708 | 94.641082 | 94.641082 | 0.0 | 0.0 | 7.720256 | 3.113848 | 0.192492 |
| 2 | 2 | 75.081571 | 0.009543 | 0.044505 | 0.029795 | 25.936958 | 25.936958 | 0.0 | 0.954344 | 5.000000 | ... | 0.0 | 3.550284 | 0.154540 | 96.488899 | 96.488899 | 0.0 | 0.0 | 8.160043 | 3.550284 | 0.199467 |
| 3 | 3 | 95.000000 | 0.009808 | 0.050228 | 0.038292 | 25.987232 | 25.987232 | 0.0 | 0.980768 | 25.000000 | ... | 0.0 | 3.725214 | 0.140574 | 98.706304 | 98.706304 | 0.0 | 0.0 | 8.143257 | 3.725214 | 0.200820 |
| 4 | 4 | 102.000000 | 0.014454 | 0.046804 | 0.043539 | 24.234888 | 24.234888 | 0.0 | 1.445428 | 31.918429 | ... | 0.0 | 5.556859 | 0.123630 | 93.169561 | 93.169561 | 0.0 | 0.0 | 9.755374 | 5.556859 | 0.194049 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4152 | 4152 | 12.859255 | 0.013560 | 28.605438 | 54.904340 | 0.000000 | 0.000000 | 0.0 | 1.356008 | 8.696801 | ... | 0.0 | 4.164367 | 28.436025 | 0.000000 | 0.000000 | 0.0 | 0.0 | 40.786449 | 4.164367 | 35.238399 |
| 4153 | 4153 | 15.250778 | 0.013415 | 30.852146 | 65.582725 | 0.000000 | 0.000000 | 0.0 | 1.341542 | 10.215766 | ... | 0.0 | 4.301449 | 32.659126 | 0.000000 | 0.000000 | 0.0 | 0.0 | 45.379721 | 4.301449 | 39.685420 |
| 4154 | 4154 | 12.143988 | 0.014226 | 23.547916 | 51.637736 | 0.000000 | 0.000000 | 0.0 | 1.422627 | 8.917877 | ... | 0.0 | 4.537564 | 19.952731 | 0.000000 | 0.000000 | 0.0 | 0.0 | 33.277787 | 4.537564 | 27.364712 |
| 4155 | 4155 | 16.383415 | 0.013609 | 29.652388 | 68.183922 | 0.000000 | 0.000000 | 0.0 | 1.360873 | 11.454327 | ... | 0.0 | 4.327241 | 28.367113 | 0.000000 | 0.000000 | 0.0 | 0.0 | 41.151269 | 4.327241 | 35.435537 |
| 4156 | 4156 | 20.523394 | 0.012605 | 34.185703 | 84.794081 | 0.000000 | 0.000000 | 0.0 | 1.260457 | 14.070116 | ... | 0.0 | 4.505345 | 38.421620 | 0.000000 | 0.000000 | 0.0 | 0.0 | 51.679572 | 4.505345 | 45.780973 |
2153 rows × 192 columns
# Count the zeros in each column and retain only the columns that contain
# at most 500 of them.
zero_data = (data_cleaned == 0).sum(axis=0)
few_zeros = zero_data <= 500
cols_to_keep = zero_data[few_zeros].index
data_cleaned = data_cleaned[cols_to_keep]
data_cleaned.shape
(2153, 325)
# Load the codebook and keep only the entries whose variable code is one of
# the columns that survived cleaning.
codebook_path = "/Users/sahithirao/Desktop/Sem3/DataViz/Project/Data set/dei-global-health-expenditure-world_dataset_Codebook-dei-global-health-expenditure-world.csv"
data_dict = pd.read_csv(codebook_path)
cleaned_column_names = data_cleaned.columns
is_kept_variable = data_dict['variable code'].isin(cleaned_column_names)
filtered_data_dict = data_dict[is_kept_variable]

# Read the filtered dataset from disk; `df` is the frame used by the charts below.
file = r"/Users/sahithirao/Desktop/Sem3/DataViz/Project/Data set/filtered_data.csv"
df = pd.read_csv(file)
df.head()
| Unnamed: 0.1 | Unnamed: 0 | country | code | region | income | year | che_gdp | che_pc_usd | che | ... | hf121_ppp2020_pc | hf122_ppp2020_pc | hf13_ppp2020_pc | hf2_ppp2020_pc | hf21_ppp2020_pc | hf22_ppp2020_pc | hf3_ppp2020_pc | gdp_ppp2020_pc | pfc_ppp2020_pc | gge_ppp2020_pc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | Algeria | DZA | AFR | Lower-middle | 2000 | 3.489033 | 62.117695 | 143870.265625 | ... | 85.652478 | 0.0 | 0.0 | 7.359546 | 2.689580 | 0.182809 | 84.802063 | 9422.670850 | 3917.143779 | 2692.144010 |
| 1 | 1 | 1 | Algeria | DZA | AFR | Lower-middle | 2001 | 3.837877 | 67.338501 | 162230.890625 | ... | 94.641082 | 0.0 | 0.0 | 7.720256 | 3.113848 | 0.192492 | 79.608176 | 9572.762464 | 4184.332806 | 2991.622448 |
| 2 | 2 | 2 | Algeria | DZA | AFR | Lower-middle | 2002 | 3.730042 | 66.947601 | 168702.312500 | ... | 96.488899 | 0.0 | 0.0 | 8.160043 | 3.550284 | 0.199467 | 83.795529 | 9973.432108 | 4386.695961 | 3419.400063 |
| 3 | 3 | 3 | Algeria | DZA | AFR | Lower-middle | 2003 | 3.601041 | 76.235474 | 189137.484375 | ... | 98.706304 | 0.0 | 0.0 | 8.143257 | 3.725214 | 0.200820 | 80.328056 | 10547.676227 | 4270.038642 | 3396.579475 |
| 4 | 4 | 4 | Algeria | DZA | AFR | Lower-middle | 2004 | 3.544073 | 93.024330 | 217928.593750 | ... | 93.169561 | 0.0 | 0.0 | 9.755374 | 5.556859 | 0.194049 | 100.318061 | 10847.517657 | 4182.638819 | 3337.235958 |
5 rows × 416 columns
# Give the indicator codes used by the charts human-readable column names.
readable_names = {
    # income level per capita
    'hf11_usd_pc': 'Govn_schemes_USD_PC',
    # highest healthcare expenditure
    'che_pc_usd': 'current healthcare expenditure(USD)',
    # total current healthcare expenditure
    'gghed_che': 'Domestic General Government Health Expenditure(GGHE-D) as %(CHE)',
    'pvtd_che': 'Domestic Private Health Expenditure (PVT-D) as %(CHE)',
    # population and current healthcare expenditure
    'pop': 'Population',
    # health spending trends
    'che_gdp': 'Current Health Expenditure(GDP)',
}
df.rename(columns=readable_names, inplace=True)
df.shape
(2153, 416)
# Column labels after the renames
df.columns
Index(['Unnamed: 0.1', 'Unnamed: 0', 'country', 'code', 'region', 'income',
'year', 'Current Health Expenditure(GDP)',
'current healthcare expenditure(USD)', 'che',
...
'hf121_ppp2020_pc', 'hf122_ppp2020_pc', 'hf13_ppp2020_pc',
'hf2_ppp2020_pc', 'hf21_ppp2020_pc', 'hf22_ppp2020_pc',
'hf3_ppp2020_pc', 'gdp_ppp2020_pc', 'pfc_ppp2020_pc', 'gge_ppp2020_pc'],
dtype='object', length=416)
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()
| Unnamed: 0.1 | Unnamed: 0 | year | Current Health Expenditure(GDP) | current healthcare expenditure(USD) | che | gghed | pvtd | ext | dom_che | ... | hf121_ppp2020_pc | hf122_ppp2020_pc | hf13_ppp2020_pc | hf2_ppp2020_pc | hf21_ppp2020_pc | hf22_ppp2020_pc | hf3_ppp2020_pc | gdp_ppp2020_pc | pfc_ppp2020_pc | gge_ppp2020_pc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2153.000000 | 2153.000000 | 2153.000000 | 2153.000000 | 2153.000000 | 2.153000e+03 | 2.153000e+03 | 2.153000e+03 | 2.153000e+03 | 2153.000000 | ... | 2153.000000 | 2153.000000 | 2153.0 | 2153.000000 | 2153.000000 | 2153.000000 | 2153.000000 | 2153.000000 | 2153.000000 | 2153.000000 |
| mean | 1602.274965 | 1602.274965 | 2010.376219 | 6.045880 | 624.941548 | 4.505470e+06 | 2.178595e+06 | 2.270971e+06 | 5.589772e+04 | 89.032424 | ... | 171.798315 | 40.279037 | 0.0 | 106.184199 | 79.298952 | 20.625910 | 249.115758 | 13650.316300 | 8130.219580 | 4639.690688 |
| std | 1133.148156 | 1133.148156 | 5.979335 | 2.517597 | 1419.583800 | 7.697960e+07 | 3.823974e+07 | 3.892272e+07 | 2.318279e+05 | 15.475841 | ... | 555.093395 | 332.038349 | 0.0 | 296.066504 | 258.771992 | 48.894818 | 300.911687 | 16705.256744 | 8709.085627 | 6848.013155 |
| min | 0.000000 | 0.000000 | 2000.000000 | 1.505008 | 4.487679 | 5.842166e-02 | 2.679574e-02 | 3.159592e-02 | 0.000000e+00 | 18.419533 | ... | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.218731 | 339.582337 | 264.420668 | 19.122773 |
| 25% | 712.000000 | 712.000000 | 2005.000000 | 4.234925 | 49.006382 | 2.212096e+03 | 1.081000e+03 | 7.693908e+02 | 9.300000e+00 | 84.201782 | ... | 0.000000 | 0.000000 | 0.0 | 11.328357 | 0.940935 | 2.254654 | 46.398863 | 3107.982261 | 2227.261391 | 622.654150 |
| 50% | 1343.000000 | 1343.000000 | 2011.000000 | 5.417539 | 166.885742 | 3.047490e+04 | 1.070509e+04 | 1.307649e+04 | 5.146395e+02 | 96.306213 | ... | 1.669358 | 0.000000 | 0.0 | 30.605346 | 6.117801 | 6.857729 | 123.990363 | 8406.713257 | 5344.835598 | 2388.303190 |
| 75% | 2303.000000 | 2303.000000 | 2016.000000 | 7.486594 | 465.081573 | 2.571768e+05 | 9.786816e+04 | 1.141192e+05 | 1.342176e+04 | 99.641861 | ... | 60.777451 | 0.000000 | 0.0 | 82.623259 | 57.792185 | 19.783227 | 357.985284 | 16413.652151 | 10382.344891 | 5166.580140 |
| max | 4156.000000 | 4156.000000 | 2021.000000 | 20.413412 | 11702.409180 | 2.102209e+09 | 1.132999e+09 | 9.690000e+08 | 2.371629e+06 | 100.907066 | ... | 5452.208601 | 3528.416516 | 0.0 | 3661.458751 | 3150.462151 | 510.854274 | 2355.271547 | 125106.034585 | 70040.684568 | 55962.805366 |
8 rows × 412 columns
# Number of missing values in each column of df
df.isnull().sum()
Unnamed: 0.1 0
Unnamed: 0 0
country 0
code 0
region 0
..
hf22_ppp2020_pc 0
hf3_ppp2020_pc 0
gdp_ppp2020_pc 0
pfc_ppp2020_pc 0
gge_ppp2020_pc 0
Length: 416, dtype: int64
# Number of rows for each region code
df['region'].value_counts()
AFR 796 AMR 629 EUR 299 EMR 175 WPR 133 SEAR 121 Name: region, dtype: int64
# Lookup from region code to readable region name.
region_mapping = dict(
    AFR='Africa',
    AMR='America',
    EUR='Europe',
    EMR='Eastern Mediterranean',
    WPR='Western Pacific',
    SEAR='South-East Asia',
)
# Swap the codes in the 'region' column for their names; values that are not
# keys of the mapping are left as they are.
df['region'] = df['region'].replace(to_replace=region_mapping)
import pandas as pd
import plotly.express as px
# Correlation heatmap over the six renamed indicator columns of `df`.
columns_of_interest = [
    'Govn_schemes_USD_PC',
    'current healthcare expenditure(USD)',
    'Domestic General Government Health Expenditure(GGHE-D) as %(CHE)',
    'Domestic Private Health Expenditure (PVT-D) as %(CHE)',
    'Population',
    'Current Health Expenditure(GDP)',
]
selected_columns = df[columns_of_interest]

# Pairwise correlation between the selected columns.
correlation_matrix = selected_columns.corr()

# The matrix is square, so the same labels serve both axes.
axis_labels = correlation_matrix.columns
fig = px.imshow(
    correlation_matrix,
    x=axis_labels,
    y=axis_labels,
    color_continuous_scale='Viridis',
)
fig.update_layout(title='Correlation Map for DataFrame', width=1000, height=1000)
fig.show()
Distribution Analysis
1) What is the healthcare expenditure distribution for different categories of financing schemes in different countries of the world? - Tree map
2) How much does the government spend per capita on schemes, by income level? - Dot distribution map
Trend Analysis
3) How have health spending trends changed over the years in different parts of the world? - Line charts time series and heat map
4) What percentage of the total current healthcare expenditure do the government and private sources contribute in the year 2020 for all countries? - Bubble chart
Relationship Analysis
5) What is the relationship between population and current healthcare expenditure per capita for the USA over the observed years? - Bar chart
6) How has the evolving relationship between purchasing power parity and exchange rates worldwide since the year 2000 impacted global economic dynamics?
Ranking/Comparison Analysis
7) Which are the top 10 countries that exhibit the highest healthcare expenditure in a given year? - Area chart
8) How does health expenditure vary across income groups within different regions, and what changes have occurred from 2010 to 2020? - Sunburst charts
# Number of missing values in the government-schemes column
df['Govn_schemes_USD_PC'].isna().sum()

# Keep only the rows that have a value in that column
data_clean_usd = df.dropna(subset=['Govn_schemes_USD_PC'])

# Columns needed for the map, with numeric values rounded to two decimals
usd_columns = ['country', 'year', 'code', 'income', 'Govn_schemes_USD_PC']
usd = data_clean_usd[usd_columns].round(2)
usd.head()
| country | year | code | income | Govn_schemes_USD_PC | |
|---|---|---|---|---|---|
| 0 | Algeria | 2000 | DZA | Lower-middle | 28.52 |
| 1 | Algeria | 2001 | DZA | Lower-middle | 33.99 |
| 2 | Algeria | 2002 | DZA | Lower-middle | 33.04 |
| 3 | Algeria | 2003 | DZA | Lower-middle | 38.67 |
| 4 | Algeria | 2004 | DZA | Lower-middle | 43.85 |
# Dot map of government spending on schemes (USD per capita): one marker per
# country code, sized by spending, coloured by income group, animated by year.
spend_col = 'Govn_schemes_USD_PC'
fig = px.scatter_geo(
    usd,
    locations='code',
    size=spend_col,
    color='income',
    hover_name='country',
    hover_data=[spend_col],
    title='Government Spending on Schemes in USD Per Capita by Country and Income Group',
    projection='natural earth',
    animation_frame='year',  # one frame (and slider step) per value of 'year'
    size_max=30,
    color_discrete_sequence=px.colors.qualitative.Plotly,
)

# Enforce a minimum marker size and remove the marker outlines.
fig.update_traces(marker=dict(sizemin=3, line=dict(width=0)))

# Hide the map frame and the coastlines.
fig.update_layout(geo=dict(showframe=False, showcoastlines=False))
fig.show()
# Number of missing values in the current healthcare expenditure column
df['current healthcare expenditure(USD)'].isnull().sum()
0
# Subset used for the ranking chart: values rounded to two decimals, rows
# without an expenditure value removed.
che_col = 'current healthcare expenditure(USD)'
df_che = (
    df[['country', 'year', 'income', che_col]]
    .copy()
    .round(2)
    .dropna(subset=[che_col])
)

# Line plot for a single year; the year is a variable so that a Flask front
# end can let the user choose it.
selected_year = 2020
df_selected = df_che[df_che['year'] == selected_year]
df_selected_top10 = df_selected.nlargest(10, che_col)

fig_bar = px.line(
    df_selected_top10,
    x='country',
    y=che_col,
    title='Top 10 countries with current healthcare expenditure per capita(in USD)',
)
fig_bar.update_layout(
    xaxis=dict(tickangle=45),
    xaxis_title='Country',
    yaxis_title='healthcare expenditure per capita',
    xaxis_categoryorder='total descending',
)

# Overlay the same ten points as red markers on top of the line.
red_markers = px.scatter(
    df_selected_top10,
    x='country',
    y=che_col,
    color_discrete_sequence=['red'],
)
fig_bar.add_trace(red_markers.data[0])
# Bubble chart for the year 2020: government share of current health
# expenditure on x, private share on y, one bubble per country sized by
# 'current healthcare expenditure(USD)'.
# Select the 2020 rows, matching the variable name and the chart's stated year.
data_for_2020 = df[df['year'] == 2020]
fig6 = px.scatter(
    data_for_2020,
    x='Domestic General Government Health Expenditure(GGHE-D) as %(CHE)',
    y='Domestic Private Health Expenditure (PVT-D) as %(CHE)',
    size='current healthcare expenditure(USD)',
    color='Domestic General Government Health Expenditure(GGHE-D) as %(CHE)',
    hover_name='country',
    labels={
        'Domestic General Government Health Expenditure(GGHE-D) as %(CHE)': 'Government Health Expenditure %',
        'Domestic Private Health Expenditure (PVT-D) as %(CHE)': 'Private Health Expenditure %',
    },
    title='Healthcare Financing by Government and Private Sources',
    size_max=60,
)
fig6.update_layout(
    xaxis_title='Government Health Expenditure %',
    yaxis_title='Private Health Expenditure %',
    coloraxis_colorbar=dict(title='Government Health Expenditure%'),
)
# Grouped bar chart for the United States of America: 'Population' and
# 'current healthcare expenditure(USD)' side by side for every year.
df_usa = df[df['country'] == 'United States of America']
fig_n = px.bar(
    df_usa,
    x='year',
    y=['Population', 'current healthcare expenditure(USD)'],
    barmode='group',
    # Title spelling corrected ("Relationship").
    title='Relationship between Population(in Thousands) and Current Healthcare Expenditure per Capita',
    labels={'value': 'Amount'},
    height=500,
    color_discrete_map={
        'Population': '#003366',
        'current healthcare expenditure(USD)': '#FFA500',
    },
)
fig_n.show()
health_data = pd.DataFrame(df)

# Average every numeric indicator per (year, region). numeric_only=True keeps
# the text columns (country, code, income) out of the mean; without it
# pandas >= 2.0 raises a TypeError and pandas 1.x emits a FutureWarning.
avg_che_gdp_data = (
    health_data.groupby(['year', 'region'])
    .mean(numeric_only=True)
    .reset_index()
)

# Smooth each region's series with a 3-row rolling mean; min_periods=1 lets
# the first rows of each region average over fewer than three values.
avg_che_gdp_data['che_gdp_smoothed'] = (
    avg_che_gdp_data.groupby('region')['Current Health Expenditure(GDP)']
    .transform(lambda x: x.rolling(window=3, min_periods=1).mean())
)

# Time series graph: one smoothed line per region
fig = px.line(
    avg_che_gdp_data,
    x='year',
    y='che_gdp_smoothed',
    color='region',
    line_shape='linear',
    title='Smoothed Current Health Expenditure as % GDP Over Time for Each Region',
)
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Smoothed Current Health Expenditure as % of GDP',
    legend_title='Region',
)
fig.show()

# Heatmap: regions as rows, years as columns
che_gdp_by_region_year = avg_che_gdp_data.pivot_table(
    index='region',
    columns='year',
    values='Current Health Expenditure(GDP)',
)
# Take the axis labels from the pivot itself so that every label is
# guaranteed to line up with the row/column it describes.
fig = px.imshow(
    che_gdp_by_region_year,
    x=list(che_gdp_by_region_year.columns),
    y=list(che_gdp_by_region_year.index),
    title='Heatmap of Current Health Expenditure as % GDP by Region and Year',
    color_continuous_scale='YlGnBu',
)
fig.show()
# Colour assigned to each region name in the sunburst charts
region_colors = dict([
    ('Africa', '#EC7063'),
    ('America', '#AF7AC5'),
    ('Eastern Mediterranean', '#48C9B0'),
    ('Europe', '#7FB3D5'),
    ('South-East Asia', '#F9E79F'),
    ('Western Pacific', '#F5B7B1'),
])

# Rows for the year 2010 only
is_2010 = df['year'] == 2010
df_10 = df[is_2010]

# Hierarchy: region -> income group -> country, with sectors sized by
# 'Current Health Expenditure(GDP)' and coloured by region.
fig = px.sunburst(
    df_10,
    path=['region', 'income', 'country'],
    values='Current Health Expenditure(GDP)',
    color='region',
    color_discrete_map=region_colors,
    title='2010 Health Expenditure as % of GDP Across Income Groups (Sunburst Chart)',
)
fig.show()
/Users/sahithirao/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /Users/sahithirao/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /Users/sahithirao/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
# Rows for the year 2020 only
is_2020 = df['year'] == 2020
df_20 = df[is_2020]

# Hierarchy: region -> income group -> country, with sectors sized by
# 'Current Health Expenditure(GDP)' and coloured by region.
fig = px.sunburst(
    df_20,
    path=['region', 'income', 'country'],
    values='Current Health Expenditure(GDP)',
    color='region',
    color_discrete_map=region_colors,
    title='2020 Health Expenditure as % of GDP Across Income Groups (Sunburst Chart)',
)
fig.show()
/Users/sahithirao/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /Users/sahithirao/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /Users/sahithirao/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
# Country plus the financing-scheme columns used by the treemap
selected_columns = [
    'country',
    'hf_usd',
    'hf1_usd',
    'hf11_usd',
    'hf2_usd',
    'hf21_usd',
    'hf22_usd',
    'hf3_usd',
]
# Health expenditure frame restricted to those columns
data_health_exp = df[selected_columns]
# Missing values per selected column
data_health_exp.isnull().sum()
country 0 hf_usd 0 hf1_usd 0 hf11_usd 0 hf2_usd 0 hf21_usd 0 hf22_usd 0 hf3_usd 0 dtype: int64
# Columns of the financing subset that contain at least one zero
contains_zero = (data_health_exp == 0).any(axis=0)
zeroes = data_health_exp.columns[contains_zero]
data_health_exp[zeroes]
| hf2_usd | hf21_usd | hf22_usd | |
|---|---|---|---|
| 0 | 42.793716 | 15.639162 | 1.062985 |
| 1 | 44.150452 | 17.807416 | 1.100822 |
| 2 | 46.440352 | 20.205341 | 1.135207 |
| 3 | 52.393583 | 23.967965 | 1.292074 |
| 4 | 76.740908 | 43.713175 | 1.526492 |
| ... | ... | ... | ... |
| 2148 | 2.694593 | 0.275123 | 2.328056 |
| 2149 | 3.171449 | 0.300615 | 2.773491 |
| 2150 | 2.405740 | 0.328033 | 1.978268 |
| 2151 | 3.123177 | 0.328416 | 2.689382 |
| 2152 | 3.774666 | 0.329070 | 3.343834 |
2153 rows × 3 columns
# Reshape to long format (one row per country / financing column / value),
# which is the layout Plotly Express works with.
melted_data = data_health_exp.melt(
    id_vars='country',
    var_name='financing_source',
    value_name='expenditure',
)

# Force 'expenditure' to numeric; anything unparseable becomes NaN.
melted_data['expenditure'] = pd.to_numeric(melted_data['expenditure'], errors='coerce')

# Keep strictly positive expenditures only (this also removes NaN rows).
is_positive = melted_data['expenditure'] > 0
melted_data = melted_data[is_positive]

# Replace the column codes with descriptive financing-scheme labels.
financing_source_labels = dict(
    hf_usd='Total Health Expenditure',
    hf1_usd='Government Schemes & Compulsory Contributions',
    hf11_usd='Government Schemes',
    hf2_usd='Voluntary Health Payment Schemes',
    hf21_usd='Voluntary Health Insurance Schemes',
    hf22_usd='NPISH Financing Schemes (including development agencies)',
    hf3_usd='Household Out-of-Pocket Payment',
)
melted_data['financing_source'] = melted_data['financing_source'].map(financing_source_labels)

# Treemap: country -> financing scheme, sized and coloured by expenditure,
# with the colour scale centred on the mean expenditure.
mean_expenditure = melted_data['expenditure'].mean()
fig = px.treemap(
    melted_data.dropna(),  # rows with any NaN are excluded from the chart
    path=['country', 'financing_source'],
    values='expenditure',
    title='Healthcare Expenditure TreeMap',
    color='expenditure',
    color_continuous_scale='Viridis',
    color_continuous_midpoint=mean_expenditure,
)
fig.show()
/Users/sahithirao/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /Users/sahithirao/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
import plotly.graph_objects as go
import plotly.offline as pyo
# Columns of the raw frame used for the polar chart
spider_data = ['year', 'ppp', 'xrt', 'oops_che', 'region']

# Take those columns from `data` and give the coded ones descriptive names;
# rename() returns a new frame, so `data` itself is left untouched.
descriptive_names = {
    'ppp': 'purchasing power parity',
    'xrt': 'exchange rates',
    'oops_che': 'out-of-pocket expenses',
}
data_spider_chart = data[spider_data].rename(columns=descriptive_names)
def get_yearwise_value(df, column_name):
    """Sum one column separately for every distinct year in a frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'year' column and the column named by
        ``column_name``.
    column_name : str
        Label of the column whose values are totalled per year.

    Returns
    -------
    tuple
        ``(years, yearwise_sum)``: the distinct values of ``df['year']`` in
        order of first appearance, and a list holding the total of
        ``column_name`` for each of those years, in the same order.
    """
    years = df['year'].unique()
    # One total per distinct year, aligned index-for-index with `years`.
    yearwise_sum = [
        df.loc[df['year'] == year, column_name].sum() for year in years
    ]
    return years, yearwise_sum
# Per-year totals for the two plotted series. The 'exchange rates' column is
# also present in data_spider_chart but is not plotted.
years, yearwise_ppp = get_yearwise_value(data_spider_chart, 'purchasing power parity')
years, yearwise_oops_che = get_yearwise_value(data_spider_chart, 'out-of-pocket expenses')

# Repeat the first point at the end of each sequence so that the polar
# outlines close on themselves.
years = list(years) + [years[0]]
yearwise_ppp = list(yearwise_ppp) + [yearwise_ppp[0]]
yearwise_oops_che = list(yearwise_oops_che) + [yearwise_oops_che[0]]

# Angular positions: the years rendered as strings.
year_labels = [str(year) for year in years]

ppp_trace = go.Scatterpolar(r=yearwise_ppp, theta=year_labels, name='purchasing power parity')
oops_trace = go.Scatterpolar(r=yearwise_oops_che, theta=year_labels, name='out-of-pocket expenses')
polar_layout = go.Layout(
    title=go.layout.Title(text='Purchasing Power Parity and Out-of-pocket Expenses'),
    polar={'radialaxis': {'visible': True}},
    showlegend=True,
)
fig = go.Figure(data=[ppp_trace, oops_trace], layout=polar_layout)
fig.show()
# pyo.plot(fig)